# https://github.com/wikimedia/wikidata-query-rdf/blob/master/docs/getting-started.md
# assumes bash
# assumes java8 with JAVA_HOME defined and $JAVA_HOME/bin on $PATH
# will build in home dir (~)
# Build the Wikidata Query Service distribution from source, in ~.
# Guard every cd: if a step fails we must not keep running commands
#  in the wrong directory (ShellCheck SC2164).
cd ~ || exit 1
# maven builds the project, git fetches the sources
sudo apt-get install maven git
git clone --recurse-submodules https://gerrit.wikimedia.org/r/wikidata/query/rdf wikidata-query-rdf
cd wikidata-query-rdf || exit 1
mvn package
cd dist/target || exit 1
# the build produces a versioned tarball; unpack it and enter the result
tar xvzf service-*-dist.tar.gz
cd service-*/ || exit 1

# assumes wikidata.nt.gz is in ~/wikidata/
# -p: also create ~/wikidata if missing, and do not fail on re-run
mkdir -p ~/wikidata/split

## OLD: WE WILL NOT MUNGE AS IT CHANGES THE DATA
# here we can add "-s" to remove sitelinks, "-l en" for only English text
#  but we keep all triples
#    this takes a while ...
# ./munge.sh -f ~/wikidata/wikidata.nt.gz -d ~/wikidata/split
## /OLD: WE WILL NOT MUNGE AS IT CHANGES THE DATA

# split the dump into smaller files with a million triples each
#  (N-Triples is one triple per line, so -l 1000000 = 1M triples;
#   --filter recompresses each chunk; $FILE is expanded by split,
#   hence the single quotes)
#  obviously might take a while
zcat ~/wikidata/wikidata.nt.gz | split -l 1000000 -a 4 -d --filter='gzip > $FILE.nt.gz' - ~/wikidata/split/wikidata-

nano runBlazegraph.sh
# Edit the launcher before starting Blazegraph:
# (1) configure main memory here: HEAP_SIZE=${HEAP_SIZE:-"16g"}
# (2) add -Dorg.wikidata.query.rdf.tool.rdf.RdfRepository.timeout=600 \
#   to the java exec command
# (3) also change -Dcom.bigdata.rdf.sparql.ast.QueryHints.analyticMaxMemoryPerQuery=0
#   which removes per-query memory limits

# Consider running this inside screen/tmux, as it must stay up for the
#  whole (multi-day) load; be sure to log stdout and stderr so you can
#  follow load progress
./runBlazegraph.sh

# Bulk-load all split files via the REST API.  This also takes a long
#  while (couple of days for one billion triples).  The HTTP call may
#  time out, but once the server has received the request it continues
#  loading without issue, it seems.
# Pass an explicit absolute path (quoted, so a $HOME containing spaces
#  still works): relative paths may be lost as the call is via HTTP,
#  so if it returns immediately, check the data location (-d) is absolute.
./loadRestAPI.sh -n wdq -d "${HOME}/wikidata/split"

# the data are stored in
# ~/wikidata-query-rdf/dist/target/service-*/wikidata.jnl

# test query
#   curl -X POST http://localhost:9999/bigdata/namespace/wdq/sparql --data-urlencode 'query=SELECT * WHERE { ?s ?p ?o } LIMIT 1'

# if you want to build multiple indexes in different namespaces
#  you have to copy the wikidata.jnl file elsewhere first; when you
#    call loadRestAPI.sh again, a new one will be created.
# then you can swap the wikidata.jnl files for querying

# when running queries, if you see messages like com.bigdata.rwstore.sector.MemoryManagerClosedException: null,
#  this seems to be a known issue and (presumably) can be ignored
#     https://phabricator.wikimedia.org/T198046